# We will start with classification techniques in gradient boosted decision trees.
# --- Setup: working-directory check and pandas display options ---
import os
os.getcwd()
# Read in csv file for Surgical Deepnet data that is stored in path:
import pandas as pd
# Show all columns and don't wrap wide frames when printing (notebook-style inspection).
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
surg_df = pd.read_csv('/Users/matthewberezo/Documents/Surgicaldeepnet.csv')
pd.set_option("display.max_rows", None)
# Quick sanity checks on the loaded frame (output only visible in a notebook).
surg_df.head()
surg_df.shape
surg_df.describe()
# As discussed, XGBoost needs little data preparation. Since all of our data is already
# numerically represented, we can simply separate it into train, validation, and test
# sets and load it into XGBoost DMatrix containers.
# Again, we split the data into training and testing
from sklearn.model_selection import train_test_split
# 80/20 train/test split; 'mort30' (30-day mortality) is the binary target.
x_train, x_test, y_train, y_test = train_test_split(surg_df.drop(columns = ['mort30']),
surg_df['mort30'],
test_size=0.2,
random_state=1)
# Carve a validation set out of the training data (another 80/20 split).
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 1)
import xgboost as xgb
# Pack each split into an XGBoost DMatrix (its optimized internal data container).
dtrain = xgb.DMatrix(data = x_train, label = y_train)
dval = xgb.DMatrix(data = x_val, label = y_val)
dtest = xgb.DMatrix(data = x_test, label = y_test)
# XGBoost learning parameters for the binary mortality model.
# Fixes vs. the original dict:
#   * 'silent' was deprecated and later removed from XGBoost -> 'verbosity': 0.
#   * 'maximize' is an argument of xgb.train (only used with custom metrics),
#     not a learning parameter, so it is dropped from the dict.
#   * 'n_jobs' is the sklearn-wrapper spelling; the native name is 'nthread'.
param = {'max_depth': 3,          # shallow trees to limit overfitting
         'eta': 0.35,             # learning rate (shrinkage per round)
         'verbosity': 0,          # quiet training output (replaces 'silent')
         'objective': 'binary:logistic',
         'eval_metric': 'logloss',
         'nthread': -1            # use all available cores
         # Tunables intentionally left at defaults: gamma, lambda, alpha,
         # min_child_weight, colsample_bytree, colsample_bynode,
         # scale_pos_weight, base_score, max_delta_step.
         }
# specify validations set to watch performance
watchlist = [(dtrain, 'train'), (dval, 'eval')]
num_round = 25 #This is another hyperparameter of sorts
# Train with early stopping once validation log-loss stops improving.
bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds = 10)
# NOTE(review): these assignments alias x_train/x_test rather than copying,
# so the 'xgb_probs' columns written below are also added to x_train/x_test
# in place.  Later cells rely on that mutation (they drop these columns
# again before retraining), so the aliasing is documented, not changed.
mort_train_w_preds = x_train
mort_train_w_preds['xgb_probs'] = bst.predict(dtrain)
mort_test_w_preds = x_test
mort_test_w_preds['xgb_probs'] = bst.predict(dtest)
from sklearn import metrics
# Test-set ROC curve and AUC from the predicted probabilities.
y = y_test
scores = mort_test_w_preds['xgb_probs']
fpr, tpr, thresholds = metrics.roc_curve(y, scores)
metrics.auc(fpr, tpr)
# We can visualize these ROC curves with matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
# Compute each curve once instead of calling roc_curve twice per plt.plot call.
train_fpr, train_tpr, _ = roc_curve(y_train, mort_train_w_preds['xgb_probs'])
test_fpr, test_tpr, _ = roc_curve(y_test, mort_test_w_preds['xgb_probs'])
plt.plot(train_fpr, train_tpr, color = 'blue',
         label='Train ROC Curve (area = %0.2f)' % roc_auc_score(y_train, mort_train_w_preds['xgb_probs']))
plt.plot(test_fpr, test_tpr, color = 'red',
         label='Test ROC Curve (area = %0.2f)' % roc_auc_score(y_test, mort_test_w_preds['xgb_probs']))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Set our parameters — identical model to `param`, but early stopping now
# tracks validation AUC instead of log-loss.
# Same fixes as before: 'silent' -> 'verbosity'; 'maximize' removed (it is an
# xgb.train argument, and built-in metrics carry their own direction);
# 'n_jobs' -> native 'nthread'.
param_auc = {'max_depth': 3,
             'eta': 0.35,
             'verbosity': 0,
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
             'nthread': -1
             # Tunables left at defaults: gamma, lambda, alpha,
             # min_child_weight, colsample_bytree, colsample_bynode,
             # scale_pos_weight, base_score, max_delta_step.
             }
# specify validations set to watch performance
watchlist = [(dtrain, 'train'), (dval, 'eval')]
num_round = 25 #This is another hyperparameter of sorts
# Same architecture as `bst`, but early stopping monitors validation AUC.
bst_auc = xgb.train(param_auc, dtrain, num_round, watchlist, early_stopping_rounds = 10)
# Store the AUC-tuned probabilities next to the earlier ones (written in
# place onto the aliased x_train/x_test frames).
mort_train_w_preds['xgb_probs_auc'] = bst_auc.predict(dtrain)
mort_test_w_preds['xgb_probs_auc'] = bst_auc.predict(dtest)
# We can visualize these ROC curves with matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
# Compute each curve once instead of calling roc_curve twice per plt.plot call.
train_fpr, train_tpr, _ = roc_curve(y_train, mort_train_w_preds['xgb_probs_auc'])
test_fpr, test_tpr, _ = roc_curve(y_test, mort_test_w_preds['xgb_probs_auc'])
plt.plot(train_fpr, train_tpr, color = 'blue',
         label='Train ROC Curve (area = %0.2f)' % roc_auc_score(y_train, mort_train_w_preds['xgb_probs_auc']))
plt.plot(test_fpr, test_tpr, color = 'red',
         label='Test ROC Curve (area = %0.2f)' % roc_auc_score(y_test, mort_test_w_preds['xgb_probs_auc']))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Set our parameters — this variant uses a much smaller learning rate and
# monitors area under the precision-recall curve (better for the rare
# positive class).  Same fixes as the earlier dicts: 'silent' -> 'verbosity';
# 'maximize' removed (xgb.train argument, not a learner parameter);
# 'n_jobs' -> native 'nthread'.
param_aucpr = {'max_depth': 3,
               'eta': 0.01,
               'verbosity': 0,
               'objective': 'binary:logistic',
               'eval_metric': 'aucpr',
               'nthread': -1
               # Tunables left at defaults: gamma, lambda, alpha,
               # min_child_weight, colsample_bytree, colsample_bynode,
               # scale_pos_weight, base_score, max_delta_step.
               }
watchlist = [(dtrain, 'train'), (dval, 'eval')]
# Train with early stopping on validation PR-AUC; note num_round is reused
# from the previous cell (still 25).
bst_aucpr = xgb.train(param_aucpr, dtrain, num_round, watchlist, early_stopping_rounds = 10)
# Store the PR-AUC-tuned probabilities in place on the aliased frames.
mort_train_w_preds['xgb_probs_aucpr'] = bst_aucpr.predict(dtrain)
mort_test_w_preds['xgb_probs_aucpr'] = bst_aucpr.predict(dtest)
# We can visualize these ROC curves with matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
# Compute each curve once instead of calling roc_curve twice per plt.plot call.
train_fpr, train_tpr, _ = roc_curve(y_train, mort_train_w_preds['xgb_probs_aucpr'])
test_fpr, test_tpr, _ = roc_curve(y_test, mort_test_w_preds['xgb_probs_aucpr'])
plt.plot(train_fpr, train_tpr, color = 'blue',
         label='Train ROC Curve (area = %0.2f)' % roc_auc_score(y_train, mort_train_w_preds['xgb_probs_aucpr']))
plt.plot(test_fpr, test_tpr, color = 'red',
         label='Test ROC Curve (area = %0.2f)' % roc_auc_score(y_test, mort_test_w_preds['xgb_probs_aucpr']))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
import lightgbm as lgb
# Build LightGBM Datasets.  x_train was mutated in place earlier (the XGBoost
# probability columns were appended through the mort_train_w_preds alias), so
# strip those columns before training; errors='ignore' keeps this safe even
# if the earlier cells were not run.
lgb_mort_train = lgb.Dataset(
    x_train.drop(columns = ['xgb_probs', 'xgb_probs_auc', 'xgb_probs_aucpr'], errors = 'ignore'),
    y_train)
# NOTE(review): despite the name, this is the *validation* set.
lgb_val_train = lgb.Dataset(x_val, y_val)
# LightGBM training configuration: binary classification scored on log-loss,
# shallow gbdt trees, with early stopping handled via the params dict.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    max_depth=3,
    learning_rate=0.1,
    early_stopping_round=10,
)
# Tunables deliberately left at their defaults for now: num_leaves,
# num_threads, scale_pos_weight, min_data_in_leaf, pos/neg_bagging_fraction,
# bagging_freq, max_delta_step, top_rate, other_rate, lambda_l1, lambda_l2.
# Train the binary LightGBM model, watching both the validation and train sets.
lgb_gbm = lgb.train(params = lgb_params, train_set = lgb_mort_train,
                    num_boost_round = 100, valid_sets = [lgb_val_train, lgb_mort_train],
                    valid_names = ['Evaluation', 'Train'])
# The xgb probability columns were appended in place to x_train AND x_test
# (via the mort_*_w_preds aliases).  Predicting with them present raises a
# LightGBM feature-count mismatch — the original code only dropped them from
# x_train.  Drop them from both; errors='ignore' keeps this cell re-runnable.
xgb_prob_cols = ['xgb_probs', 'xgb_probs_auc', 'xgb_probs_aucpr']
y_probs_train = lgb_gbm.predict(x_train.drop(columns = xgb_prob_cols, errors = 'ignore'))
y_probs_test = lgb_gbm.predict(x_test.drop(columns = xgb_prob_cols, errors = 'ignore'))
# Train and test AUC from the predicted probabilities.
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_probs_train)
metrics.auc(fpr, tpr)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_probs_test)
metrics.auc(fpr, tpr)
#### LightGBM Example with different evaluation metric:
# Same LightGBM configuration as lgb_params, except early stopping is
# scored on validation AUC rather than log-loss.
lgb_params_auc = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    max_depth=3,
    learning_rate=0.1,
    early_stopping_round=10,
)
# Tunables left at defaults: num_leaves, num_threads, scale_pos_weight,
# min_data_in_leaf, pos/neg_bagging_fraction, bagging_freq, max_delta_step,
# top_rate, other_rate, lambda_l1, lambda_l2.
# Train the AUC-monitored LightGBM model on the same Datasets.
lgb_gbm_auc = lgb.train(params = lgb_params_auc, train_set = lgb_mort_train,
num_boost_round = 100, valid_sets = [lgb_val_train, lgb_mort_train],
valid_names = ['Evaluation', 'Train'])
# Re-inspect the raw data before the CatBoost section below.
surg_df.head()
# Columns holding categorical codes in the surgical data; CatBoost wants them
# typed as strings so it applies its own categorical handling.
surg_cat_cols = ['asa_status', 'baseline_cancer', 'baseline_charlson', 'baseline_cvd',
                 'baseline_dementia', 'baseline_diabetes', 'baseline_digestive',
                 'baseline_osteoart', 'baseline_psych', 'baseline_pulmonary',
                 'dow', 'gender', 'month', 'moonphase', 'race', 'complication']
xgb_prob_cols = ['xgb_probs', 'xgb_probs_auc', 'xgb_probs_aucpr']
# .drop() already returns new frames; errors='ignore' keeps this cell safe if
# the earlier XGBoost cells (which appended these columns in place) were not
# run.  x_val_cat is copied explicitly — the original aliased x_val and the
# astype below silently mutated it.
x_train_cat = x_train.drop(columns = xgb_prob_cols, errors = 'ignore')
x_val_cat = x_val.copy()
x_test_cat = x_test.drop(columns = xgb_prob_cols, errors = 'ignore')
for frame in (x_train_cat, x_val_cat, x_test_cat):
    frame[surg_cat_cols] = frame[surg_cat_cols].astype(str)
# Inspect the recast frame (output only shows in a notebook).
x_train_cat.head()
x_train_cat.nunique()
x_train.dtypes
import numpy as np
# Create index for categorical variables: after the astype(str) above the
# categorical columns are object dtype, so "not float" selects them.
# BUG FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `float` is the documented replacement and compares identically
# against pandas dtypes.
predictors = x_train_cat
categorical_var = np.where(predictors.dtypes != float)[0]
print('\nCategorical Variables indices : ',categorical_var)
from catboost import CatBoostClassifier, Pool, cv
# CatBoost with the standard Logloss objective.  Categorical columns are
# passed by positional index so CatBoost applies its own target-based
# encoding instead of treating them as numeric.
cat_boost_model = CatBoostClassifier(
loss_function = 'Logloss',
random_seed=42,
iterations = 10,
learning_rate = 0.03,
early_stopping_rounds = 10,
#l2_leaf_reg = ???
depth = 3
)
# plot=True renders a live training chart (notebook only).
cat_boost_model.fit(
x_train_cat, y_train
,cat_features=categorical_var,
eval_set=(x_val_cat, y_val)
, plot = True
)
# predict_proba returns one column per class; after add_prefix, column
# 'cat1' holds the positive-class probability P(mort30 == 1).
catboost_probs_train = cat_boost_model.predict_proba(x_train_cat)
catboost_probs = cat_boost_model.predict_proba(x_test_cat)
catboost_probs_df_train = pd.DataFrame(catboost_probs_train)
catboost_probs_df_train = catboost_probs_df_train.add_prefix('cat')
catboost_probs_df = pd.DataFrame(catboost_probs)
catboost_probs_df = catboost_probs_df.add_prefix('cat')
# Train and test AUC for the Logloss model.
fprc, tprc, thresholds = metrics.roc_curve(y_train, catboost_probs_df_train['cat1'])
metrics.auc(fprc, tprc)
fprc, tprc, thresholds = metrics.roc_curve(y_test, catboost_probs_df['cat1'])
metrics.auc(fprc, tprc)
# Same model trained with the CrossEntropy loss for comparison.
cat_boost_model_ce = CatBoostClassifier(
loss_function = 'CrossEntropy',
random_seed=42,
iterations = 10,
learning_rate = 0.03,
early_stopping_rounds = 10,
#l2_leaf_reg = ???
depth = 3
)
cat_boost_model_ce.fit(
x_train_cat, y_train
,cat_features=categorical_var,
eval_set=(x_val_cat, y_val)
, plot = True
)
catboost_probs_ce_train = cat_boost_model_ce.predict_proba(x_train_cat)
catboost_probs_ce = cat_boost_model_ce.predict_proba(x_test_cat)
catboost_probs_df_train_ce = pd.DataFrame(catboost_probs_ce_train)
catboost_probs_df_train_ce = catboost_probs_df_train_ce.add_prefix('cat')
catboost_probs_df_ce = pd.DataFrame(catboost_probs_ce)
catboost_probs_df_ce = catboost_probs_df_ce.add_prefix('cat')
# Train and test AUC for the CrossEntropy model.
fprc, tprc, thresholds = metrics.roc_curve(y_train, catboost_probs_df_train_ce['cat1'])
metrics.auc(fprc, tprc)
fprc, tprc, thresholds = metrics.roc_curve(y_test, catboost_probs_df_ce['cat1'])
metrics.auc(fprc, tprc)
# We will use the Kaggle Ames housing dataset (read from CSV below) to build
# gradient boosted decision trees for regression.
# NOTE(review): the original `from sklearn.datasets import load_boston` was
# never used (the data is actually read from CSV below) and load_boston was
# removed from scikit-learn in 1.2, so the import itself now fails.  It has
# been removed.
house_price = pd.read_csv('/Users/matthewberezo/Documents/kaggle_housing.csv')
# Quick sanity checks (output only shows in a notebook).
house_price.shape
house_price.dtypes
target = house_price['SalePrice']
house_price.head()
house_price['Alley'].unique()
from sklearn import preprocessing

def _label_encode(df, column_pairs):
    """Label-encode categorical columns of *df* in place.

    For each (source, destination) pair: cast the source column to str
    (so NaN becomes the literal 'nan' category, matching the original
    behavior), fit a LabelEncoder on it, and write the integer codes to
    the destination column.

    Parameters
    ----------
    df : pandas.DataFrame — mutated in place.
    column_pairs : iterable of (source_column, encoded_column) name pairs.

    Returns
    -------
    dict mapping source column name -> fitted LabelEncoder, so the same
    encoding can be reapplied or inverted later.
    """
    encoders = {}
    for src, dst in column_pairs:
        df[src] = df[src].astype(str)
        enc = preprocessing.LabelEncoder()
        enc.fit(df[src])
        df[dst] = enc.transform(df[src])
        encoders[src] = enc
    return encoders

# Every categorical column gets an integer-coded twin named '<col>_2'.
# MSSubClass's twin keeps its historical upper-case name 'MSSUBCLASS_2'
# because downstream feature selection refers to it by that exact name.
_house_cat_pairs = [('MSSubClass', 'MSSUBCLASS_2')] + [
    (col, col + '_2') for col in [
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'Foundation', 'BsmtQual',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
        'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    ]
]
house_price_encoders = _label_encode(house_price, _house_cat_pairs)
house_price.head()
# Feature matrix: raw numeric columns plus the label-encoded '_2' twins of
# the categorical columns created above.  (LotFrontage/MasVnrArea/GarageYrBlt
# contain NaNs, which XGBoost handles natively.)
i_vars = house_price[['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath'
,'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr'
,'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch'
,'3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
,'MSSUBCLASS_2', 'MSZoning_2', 'Street_2', 'Alley_2', 'LotShape_2'
,'LandContour_2', 'Utilities_2', 'LotConfig_2', 'LandSlope_2'
,'Neighborhood_2',
'Condition1_2', 'Condition2_2', 'BldgType_2', 'HouseStyle_2'
,'RoofStyle_2', 'RoofMatl_2', 'Exterior1st_2', 'Exterior2nd_2',
'MasVnrType_2', 'ExterQual_2', 'Foundation_2', 'BsmtQual_2', 'BsmtCond_2',
'BsmtExposure_2', 'BsmtFinType1_2', 'BsmtFinType2_2', 'Heating_2',
'HeatingQC_2', 'CentralAir_2', 'Electrical_2', 'KitchenQual_2',
'FireplaceQu_2', 'GarageType_2', 'GarageFinish_2', 'GarageQual_2',
'GarageCond_2', 'PavedDrive_2'
]]
# Regression target: the sale price.
target = house_price['SalePrice']
# Split new dataset into training and test algorithms
x_trainr, x_testr, y_trainr, y_testr = train_test_split(i_vars,
target,
test_size=0.2,
random_state=1)
# Second split carves a validation set out of the training data.
x_trainr, x_valr, y_trainr, y_valr = train_test_split(x_trainr, y_trainr, test_size = 0.2, random_state = 1)
x_trainr.shape
# DMatrix containers for the regression model.
dtrainr = xgb.DMatrix(data = x_trainr, label = y_trainr)
dvalr = xgb.DMatrix(data = x_valr, label = y_valr)
dtestr = xgb.DMatrix(data = x_testr, label = y_testr)
# Linear-booster parameters for regression.  'maximize' is not a learning
# parameter (it is an argument of xgb.train, only relevant for custom
# metrics), so it has been removed from the dict.
param_r = {'booster' : 'gblinear'
           #,'lambda' : ???
           #,'alpha' : ???
           ,'feature_selector' : 'cyclic' #also have 'shuffle', 'random', 'greedy', 'thrifty'
           #, 'top_k' : ??? # only available for greedy and thrifty selector
           , 'objective' : 'reg:squarederror' #also have 'squaredlogerror'
           , 'eval_metric' : 'rmse' # also have 'rmsle'
           }
# specify validation set to watch performance during training
watchlist = [(dtrainr, 'train'), (dvalr, 'eval')]
num_round = 25 #This is another hyperparameter of sorts
# Train the linear-booster regression model with early stopping on RMSE.
xgb_r = xgb.train(param_r, dtrainr, num_round, watchlist, early_stopping_rounds = 10)
# NOTE(review): these assignments alias x_trainr/x_testr rather than copying,
# so the prediction, target, and residual columns added below are also written
# into x_trainr/x_testr in place.  The LightGBM cell below depends on dropping
# them again, so the aliasing is documented rather than changed here.
house_price_train_preds = x_trainr
house_price_train_preds['price_pred'] = xgb_r.predict(dtrainr)
house_price_test_preds = x_testr
house_price_test_preds['price_pred'] = xgb_r.predict(dtestr)
# Attach the actual sale prices and the test-set residuals.
house_price_train_preds['PRICE'] = y_trainr
house_price_test_preds['PRICE'] = y_testr
house_price_test_preds['ERROR'] = house_price_test_preds['PRICE'] - house_price_test_preds['price_pred']
from sklearn.metrics import r2_score
# R-squared on train vs. test to gauge over/under-fitting.
print("Train R2 =", r2_score(house_price_train_preds['PRICE'], house_price_train_preds['price_pred'])
,"Test R2 =", r2_score(house_price_test_preds['PRICE'], house_price_test_preds['price_pred']))
import plotly.express as px
# Interactive predicted-vs-actual scatter; hover shows the residual and price.
fig = px.scatter(house_price_test_preds, x="price_pred", y="PRICE" #color = #,
,hover_data=['ERROR', 'PRICE']
)
fig.show()
# LightGBM regression configuration: gbdt boosting scored on mean absolute
# error (metric 'l1'), with early stopping configured through the params.
lgb_params_r = dict(
    boosting_type='gbdt',        # alternatives: goss, dart
    objective='regression',
    metric='l1',                 # aliases: mean_absolute_error / mae
    max_depth=3,
    learning_rate=0.1,
    early_stopping_round=10,
)
# Tunables left at defaults: num_leaves, num_threads, min_data_in_leaf,
# bagging_fraction/bagging_freq, max_delta_step, top_rate, other_rate,
# lambda_l1, lambda_l2.
# Train/validation Datasets; errors='ignore' keeps the drop safe if the
# earlier XGBoost cells (which appended these columns in place) did not run.
lgb_house_train = lgb.Dataset(
    x_trainr.drop(columns = ['price_pred', 'PRICE'], errors = 'ignore'), y_trainr)
lgb_house_val = lgb.Dataset(x_valr, y_valr)
lgb_gbm_reg = lgb.train(params = lgb_params_r, train_set = lgb_house_train,
                        num_boost_round = 100, valid_sets = [lgb_house_val, lgb_house_train],
                        valid_names = ['Evaluation', 'Train'])
# BUG FIX: x_trainr/x_testr alias the *_preds frames, so they carry the
# appended prediction/target columns; predicting on them unchanged raises a
# LightGBM feature-count mismatch.  Drop the extras before predicting.
house_price_train_preds['price_pred_lgb'] = lgb_gbm_reg.predict(
    x_trainr.drop(columns = ['price_pred', 'PRICE'], errors = 'ignore'))
house_price_test_preds['price_pred_lgb'] = lgb_gbm_reg.predict(
    x_testr.drop(columns = ['price_pred', 'PRICE', 'ERROR'], errors = 'ignore'))
# Residuals and R-squared for the LightGBM regressor.
house_price_test_preds['ERROR_lgb'] = house_price_test_preds['PRICE'] - house_price_test_preds['price_pred_lgb']
print("LGB Train R2 =", r2_score(house_price_train_preds['PRICE'], house_price_train_preds['price_pred_lgb'])
,"LGB Test R2 =", r2_score(house_price_test_preds['PRICE'], house_price_test_preds['price_pred_lgb']))
import plotly.express as px
# Interactive predicted-vs-actual scatter for the LightGBM model.
fig = px.scatter(house_price_test_preds, x="price_pred_lgb", y="PRICE"
,hover_data=['ERROR_lgb', 'PRICE']
)
fig.show()
# We will reuse our wine dataset for multinomial classification:
wine_df = pd.read_csv('/Users/matthewberezo/Documents/wineQualityReds.csv')
# Drop the CSV's exported row-index column.
wine_df = wine_df.drop(['Unnamed: 0'], axis=1)
wine_df.head()
wine_df.shape
wine_df['quality'].unique()
# Shift quality scores (3..8) down to 0..5 so class labels start at zero,
# as the XGBoost/LightGBM multiclass objectives require.
wine_df['quality'] = wine_df['quality'] - 3
wine_df['quality'].unique()
# NOTE(review): the feature frame still contains the 'quality' target here;
# it is dropped just before building the DMatrix/Dataset objects below.
x_train, x_test, y_train, y_test = train_test_split(wine_df,
wine_df['quality'],
test_size=0.2,
random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 1)
# Multinomial (6-class wine quality) XGBoost parameters.
# Fixes vs. the original dict: 'silent' -> 'verbosity' (removed from
# XGBoost); 'maximize' dropped (xgb.train argument, not a learner
# parameter); 'n_jobs' -> native 'nthread'.
xgb_param_mn = {'max_depth': 3,
                'eta': 0.35,
                'verbosity': 0,
                'objective': 'multi:softprob', # also have multi:softmax --> need to set "num_class" if this is used
                'eval_metric': 'mlogloss',
                'num_class': 6,                # quality labels were shifted to 0..5
                'nthread': -1                  # use all available cores
                # Tunables left at defaults: gamma, lambda, alpha,
                # min_child_weight, colsample_bytree, colsample_bynode,
                # base_score, max_delta_step.
                }
# DMatrix containers for the wine data; drop the target from the features.
dtrain_mn = xgb.DMatrix(data = x_train.drop(columns = 'quality'), label = y_train)
dval_mn = xgb.DMatrix(data = x_val.drop(columns = 'quality'), label = y_val)
dtest_mn = xgb.DMatrix(data = x_test.drop(columns = 'quality'), label = y_test)
# specify validations set to watch performance
watchlist = [(dtrain_mn, 'train'), (dval_mn, 'eval')]
num_round = 25 #This is another hyperparameter of sorts
# Train the multinomial model with early stopping on validation mlogloss.
bst = xgb.train(xgb_param_mn, dtrain_mn, num_round, watchlist, early_stopping_rounds = 10)
# Per-class probability matrices from the softprob model.
preds = bst.predict(dtrain_mn)
preds_test = bst.predict(dtest_mn)
# Hard class prediction = index of the most probable class in each row.
best_preds = np.asarray(preds).argmax(axis=1)
best_preds_test = np.asarray(preds_test).argmax(axis=1)
best_preds_df_train = pd.DataFrame(best_preds).add_prefix('PRED_QUAL')
best_preds_df_test = pd.DataFrame(best_preds_test).add_prefix('PRED_QUAL')
best_preds_df_test.head()
# Line the true labels up with the predictions (reset_index gives both frames
# a matching 0..n-1 index) and flag the correct ones.
y_train_df = pd.DataFrame(y_train).add_prefix('QUALITY').reset_index()
y_train_df['PRED_QUALITY'] = best_preds_df_train['PRED_QUAL0']
y_train_df['CORRECT_PREDS'] = np.where(y_train_df['PRED_QUALITY'] == y_train_df['QUALITYquality'], 1, 0)
y_test_df = pd.DataFrame(y_test).add_prefix('QUALITY').reset_index()
y_test_df['PRED_QUALITY'] = best_preds_df_test['PRED_QUAL0']
y_test_df['CORRECT_PREDS'] = np.where(y_test_df['PRED_QUALITY'] == y_test_df['QUALITYquality'], 1, 0)
y_test_df.head()
# Simple accuracy: fraction of rows predicted correctly.
sum(y_train_df['CORRECT_PREDS'])/len(y_train_df)
sum(y_test_df['CORRECT_PREDS'])/len(y_test_df)
# LightGBM configuration for 6-class wine-quality classification.
lgb_params_mn = dict(
    boosting_type='gbdt',
    objective='multiclass',      # alternative: multiclassova
    metric='multi_logloss',
    num_class=6,
    max_depth=3,
    learning_rate=0.1,
    early_stopping_round=10,
)
# Tunables left at defaults: num_leaves, num_threads, scale_pos_weight,
# min_data_in_leaf, pos/neg_bagging_fraction, bagging_freq, max_delta_step,
# top_rate, other_rate, lambda_l1, lambda_l2.
# LightGBM Datasets for the wine data (target column removed from features).
lgb_wine_train = lgb.Dataset(x_train.drop(columns = 'quality'), y_train)
lgb_wine_val = lgb.Dataset(x_val.drop(columns = 'quality'), y_val)
lgb_gbm_nm = lgb.train(params = lgb_params_mn, train_set = lgb_wine_train,
                       num_boost_round = 100, valid_sets = [lgb_wine_val, lgb_wine_train],
                       valid_names = ['Evaluation', 'Train'])
# Per-class probability matrices for train and test.
lgb_mn_preds_train = lgb_gbm_nm.predict(x_train.drop(columns = 'quality'))
lgb_mn_preds_test = lgb_gbm_nm.predict(x_test.drop(columns = 'quality'))
# BUG FIX: the original `lgb_gbm_nm.print_evaluation` raised AttributeError —
# print_evaluation is a lightgbm *callback*, not a Booster attribute.
# Report the recorded best validation scores instead.
print(lgb_gbm_nm.best_score)
# Hard class prediction = index of the most probable class per row.
best_lgb_preds_train = pd.DataFrame(
    np.asarray(lgb_mn_preds_train).argmax(axis=1)).add_prefix('PRED_QUAL')
best_lgb_preds_test = pd.DataFrame(
    np.asarray(lgb_mn_preds_test).argmax(axis=1)).add_prefix('PRED_QUAL')
best_lgb_preds_test.head()
# Align predictions with the label frames built earlier and flag matches.
y_train_df['PRED_QUALITY_LGB'] = best_lgb_preds_train['PRED_QUAL0']
y_train_df['CORRECT_PREDS_LGB'] = np.where(y_train_df['PRED_QUALITY_LGB'] == y_train_df['QUALITYquality'], 1, 0)
y_test_df['PRED_QUALITY_LGB'] = best_lgb_preds_test['PRED_QUAL0']
y_test_df['CORRECT_PREDS_LGB'] = np.where(y_test_df['PRED_QUALITY_LGB'] == y_test_df['QUALITYquality'], 1, 0)
y_test_df.head()
# Simple accuracy: fraction of rows predicted correctly.
sum(y_train_df['CORRECT_PREDS_LGB'])/len(y_train_df)
sum(y_test_df['CORRECT_PREDS_LGB'])/len(y_test_df)